{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 6 - Solving Regression Problems with Machine Learning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.1. Preparing Data for Regression Problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\usman\\Anaconda3\\lib\\site-packages\\seaborn\\utils.py:376: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 376 of the file C:\\Users\\usman\\Anaconda3\\lib\\site-packages\\seaborn\\utils.py. To get rid of this warning, pass the additional argument 'features=\"lxml\"' to the BeautifulSoup constructor.\n",
      "\n",
      "  gh_list = BeautifulSoup(http)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['anagrams',\n",
       " 'anscombe',\n",
       " 'attention',\n",
       " 'brain_networks',\n",
       " 'car_crashes',\n",
       " 'diamonds',\n",
       " 'dots',\n",
       " 'exercise',\n",
       " 'flights',\n",
       " 'fmri',\n",
       " 'gammas',\n",
       " 'geyser',\n",
       " 'iris',\n",
       " 'mpg',\n",
       " 'penguins',\n",
       " 'planets',\n",
       " 'tips',\n",
       " 'titanic']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "sns.get_dataset_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>total_bill</th>\n",
       "      <th>tip</th>\n",
       "      <th>sex</th>\n",
       "      <th>smoker</th>\n",
       "      <th>day</th>\n",
       "      <th>time</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>16.99</td>\n",
       "      <td>1.01</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>10.34</td>\n",
       "      <td>1.66</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>21.01</td>\n",
       "      <td>3.50</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>23.68</td>\n",
       "      <td>3.31</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>24.59</td>\n",
       "      <td>3.61</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   total_bill   tip     sex smoker  day    time  size\n",
       "0       16.99  1.01  Female     No  Sun  Dinner     2\n",
       "1       10.34  1.66    Male     No  Sun  Dinner     3\n",
       "2       21.01  3.50    Male     No  Sun  Dinner     3\n",
       "3       23.68  3.31    Male     No  Sun  Dinner     2\n",
       "4       24.59  3.61  Female     No  Sun  Dinner     4"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tips_df = sns.load_dataset(\"tips\")\n",
    "tips_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>carat</th>\n",
       "      <th>cut</th>\n",
       "      <th>color</th>\n",
       "      <th>clarity</th>\n",
       "      <th>depth</th>\n",
       "      <th>table</th>\n",
       "      <th>price</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.23</td>\n",
       "      <td>Ideal</td>\n",
       "      <td>E</td>\n",
       "      <td>SI2</td>\n",
       "      <td>61.5</td>\n",
       "      <td>55.0</td>\n",
       "      <td>326</td>\n",
       "      <td>3.95</td>\n",
       "      <td>3.98</td>\n",
       "      <td>2.43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.21</td>\n",
       "      <td>Premium</td>\n",
       "      <td>E</td>\n",
       "      <td>SI1</td>\n",
       "      <td>59.8</td>\n",
       "      <td>61.0</td>\n",
       "      <td>326</td>\n",
       "      <td>3.89</td>\n",
       "      <td>3.84</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.23</td>\n",
       "      <td>Good</td>\n",
       "      <td>E</td>\n",
       "      <td>VS1</td>\n",
       "      <td>56.9</td>\n",
       "      <td>65.0</td>\n",
       "      <td>327</td>\n",
       "      <td>4.05</td>\n",
       "      <td>4.07</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.29</td>\n",
       "      <td>Premium</td>\n",
       "      <td>I</td>\n",
       "      <td>VS2</td>\n",
       "      <td>62.4</td>\n",
       "      <td>58.0</td>\n",
       "      <td>334</td>\n",
       "      <td>4.20</td>\n",
       "      <td>4.23</td>\n",
       "      <td>2.63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.31</td>\n",
       "      <td>Good</td>\n",
       "      <td>J</td>\n",
       "      <td>SI2</td>\n",
       "      <td>63.3</td>\n",
       "      <td>58.0</td>\n",
       "      <td>335</td>\n",
       "      <td>4.34</td>\n",
       "      <td>4.35</td>\n",
       "      <td>2.75</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   carat      cut color clarity  depth  table  price     x     y     z\n",
       "0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43\n",
       "1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31\n",
       "2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31\n",
       "3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63\n",
       "4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diamond_df = sns.load_dataset(\"diamonds\")\n",
    "diamond_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6.1.1. Dividing Data into Features and Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = tips_df.drop(['tip'], axis=1)\n",
    "y = tips_df[\"tip\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>total_bill</th>\n",
       "      <th>sex</th>\n",
       "      <th>smoker</th>\n",
       "      <th>day</th>\n",
       "      <th>time</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>16.99</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>10.34</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>21.01</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>23.68</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>24.59</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   total_bill     sex smoker  day    time  size\n",
       "0       16.99  Female     No  Sun  Dinner     2\n",
       "1       10.34    Male     No  Sun  Dinner     3\n",
       "2       21.01    Male     No  Sun  Dinner     3\n",
       "3       23.68    Male     No  Sun  Dinner     2\n",
       "4       24.59  Female     No  Sun  Dinner     4"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.01\n",
       "1    1.66\n",
       "2    3.50\n",
       "3    3.31\n",
       "4    3.61\n",
       "Name: tip, dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6.1.2. Converting Categorical Data to Numbers\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "numerical = X.drop(['sex', 'smoker', 'day', 'time'], axis = 1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>total_bill</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>16.99</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>10.34</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>21.01</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>23.68</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>24.59</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   total_bill  size\n",
       "0       16.99     2\n",
       "1       10.34     3\n",
       "2       21.01     3\n",
       "3       23.68     2\n",
       "4       24.59     4"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "numerical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sex</th>\n",
       "      <th>smoker</th>\n",
       "      <th>day</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Sun</td>\n",
       "      <td>Dinner</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      sex smoker  day    time\n",
       "0  Female     No  Sun  Dinner\n",
       "1    Male     No  Sun  Dinner\n",
       "2    Male     No  Sun  Dinner\n",
       "3    Male     No  Sun  Dinner\n",
       "4  Female     No  Sun  Dinner"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "categorical = X.filter(['sex', 'smoker', 'day', 'time'])\n",
    "categorical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sex_Female</th>\n",
       "      <th>smoker_No</th>\n",
       "      <th>day_Fri</th>\n",
       "      <th>day_Sat</th>\n",
       "      <th>day_Sun</th>\n",
       "      <th>time_Dinner</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sex_Female  smoker_No  day_Fri  day_Sat  day_Sun  time_Dinner\n",
       "0           1          1        0        0        1            1\n",
       "1           0          1        0        0        1            1\n",
       "2           0          1        0        0        1            1\n",
       "3           0          1        0        0        1            1\n",
       "4           1          1        0        0        1            1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "cat_numerical = pd.get_dummies(categorical,drop_first=True)\n",
    "cat_numerical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>total_bill</th>\n",
       "      <th>size</th>\n",
       "      <th>sex_Female</th>\n",
       "      <th>smoker_No</th>\n",
       "      <th>day_Fri</th>\n",
       "      <th>day_Sat</th>\n",
       "      <th>day_Sun</th>\n",
       "      <th>time_Dinner</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>16.99</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>10.34</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>21.01</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>23.68</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>24.59</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   total_bill  size  sex_Female  smoker_No  day_Fri  day_Sat  day_Sun  \\\n",
       "0       16.99     2           1          1        0        0        1   \n",
       "1       10.34     3           0          1        0        0        1   \n",
       "2       21.01     3           0          1        0        0        1   \n",
       "3       23.68     2           0          1        0        0        1   \n",
       "4       24.59     4           1          1        0        0        1   \n",
       "\n",
       "   time_Dinner  \n",
       "0            1  \n",
       "1            1  \n",
       "2            1  \n",
       "3            1  \n",
       "4            1  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = pd.concat([numerical, cat_numerical], axis = 1)\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6.1.3. Divide Data into Training and Test Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.20, random_state=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6.1.4. Data Scaling/Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "sc = StandardScaler()\n",
    "X_train = sc.fit_transform(X_train)\n",
    "X_test = sc.transform (X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.2. Linear Regression "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error: 0.7080218832979829\n",
      "Mean Squared Error: 0.893919522160961\n",
      "Root Mean Squared Error: 0.9454731736865732\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "lin_reg = LinearRegression()\n",
    "regressor = lin_reg.fit(X_train, y_train)\n",
    "\n",
    "y_pred = regressor.predict(X_test)\n",
    "\n",
    "\n",
    "from sklearn import metrics\n",
    "\n",
    "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))\n",
    "print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))\n",
    "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.3. KNN Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error: 0.7513877551020406\n",
      "Mean Squared Error: 0.9462902040816326\n",
      "Root Mean Squared Error: 0.9727744877830794\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "knn_reg = KNeighborsRegressor(n_neighbors=5)\n",
    "regressor = knn_reg.fit(X_train, y_train)\n",
    "\n",
    "y_pred = regressor.predict(X_test)\n",
    "\n",
    "\n",
    "from sklearn import metrics\n",
    "\n",
    "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))\n",
    "print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))\n",
    "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.4. Random Forest Regression "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error: 0.7054065306122449\n",
      "Mean Squared Error: 0.8045782841306138\n",
      "Root Mean Squared Error: 0.8969828783932354\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)\n",
    "regressor = rf_reg.fit(X_train, y_train)\n",
    "y_pred = regressor.predict(X_test)\n",
    "\n",
    "\n",
    "from sklearn import metrics\n",
    "\n",
    "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))\n",
    "print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))\n",
    "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.5. Support Vector Regression\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error: 0.7362521512772694\n",
      "Mean Squared Error: 0.9684825097223093\n",
      "Root Mean Squared Error: 0.9841150896731079\n"
     ]
    }
   ],
   "source": [
    "from sklearn import svm\n",
    "svm_reg = svm.SVR()\n",
    "\n",
    "regressor = svm_reg.fit(X_train, y_train)\n",
    "y_pred = regressor.predict(X_test)\n",
    "\n",
    "\n",
    "from sklearn import metrics\n",
    "\n",
    "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))\n",
    "print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))\n",
    "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.6. K Fold Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-0.66386205 -0.57007269 -0.63598762 -0.96960743 -0.87391702]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "print(cross_val_score(regressor, X, y, cv=5, scoring =\"neg_mean_absolute_error\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.7. Predicting a Single Value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "total_bill     11.35\n",
       "tip              2.5\n",
       "sex           Female\n",
       "smoker           Yes\n",
       "day              Fri\n",
       "time          Dinner\n",
       "size               2\n",
       "Name: 100, dtype: object"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tips_df.loc[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2.2609]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)\n",
    "regressor = rf_reg.fit(X_train, y_train)\n",
    "\n",
    "single_record = sc.transform (X.values[100].reshape(1, -1))\n",
    "predicted_tip = regressor.predict(single_record)\n",
    "print(predicted_tip)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 6.1\n",
    "\n",
    "\n",
    "### Question 1\n",
    "\n",
    "Which of the following is an example of a regression output:\n",
    "\n",
    "A- True \\\n",
    "B- Red \\\n",
    "C- 2.5 \\\n",
    "D- None of the above\n",
    "\n",
    "Answer: C\n",
    "    \n",
    "    \n",
    "### Question 2\n",
    "\n",
    "Which of the following algorithm is a lazy algorithm?\n",
    "\n",
    "A- Random Forest \\\n",
    "B- KNN \\\n",
    "C- SVM \\\n",
    "D- Linear Regression\n",
    "\n",
    "Answer: B\n",
    "\n",
    "\n",
    "### Question 3\n",
    "\n",
    "Which of the following algorithm is not a regression metric?\n",
    "\n",
    "A- Accuracy \\\n",
    "B- Recall \\\n",
    "C- F1 Measure \\\n",
    "D- All of the above\n",
    "\n",
    "Answer: D"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 6.2\n",
    "\n",
    "Using the `diamonds` dataset from seaborn library. Train a regression algorithm of your choice which predicts the price of the diamond. Perform all the preprocessing steps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Solution:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error: 1719.4644618294901\n",
      "Mean Squared Error: 10457583.658419697\n",
      "Root Mean Squared Error: 3233.8187423570444\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "\n",
    "diamonds_df = sns.load_dataset(\"diamonds\")\n",
    "\n",
    "X = diamonds_df.drop(['price'], axis=1)\n",
    "y = diamonds_df[\"price\"]\n",
    "\n",
    "numerical = X.drop(['cut', 'color', 'clarity'], axis = 1)\n",
    "\n",
    "categorical = X.filter(['cut', 'color', 'clarity'])\n",
    "\n",
    "cat_numerical = pd.get_dummies(categorical,drop_first=True)\n",
    "\n",
    "X = pd.concat([numerical, cat_numerical], axis = 1)\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.20, random_state=0)\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "sc = StandardScaler()\n",
    "X_train = sc.fit_transform(X_train)\n",
    "X_test = sc.transform (X_test)\n",
    "\n",
    "from sklearn import svm\n",
    "svm_reg = svm.SVR()\n",
    "regressor = svm_reg.fit(X_train, y_train)\n",
    "y_pred = regressor.predict(X_test)\n",
    "\n",
    "\n",
    "\n",
    "from sklearn import metrics\n",
    "\n",
    "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))\n",
    "print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))\n",
    "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
